
## Prepare Final Dataset

rm(list=ls())
options(scipen=100,digits=10)
library(raster)
library(plyr)
library(dplyr)
library(lfe)
library(ggplot2)
library(stargazer)


########################
# Load the integrated dataset
load("Data/processed data/Data_Combined")

########################
# Generate weekdays/weekends variable
data$weekday=weekdays(data$DATE)

########################
# Define year label 
data$Year=NA
data$Year[which(data$DATE>=as.Date("2013-10-22")&data$DATE<=as.Date("2014-03-25"))]=2013
data$Year[which(data$DATE>=as.Date("2014-10-22")&data$DATE<=as.Date("2015-03-25"))]=2014
data$Year[which(data$DATE>=as.Date("2015-10-22")&data$DATE<=as.Date("2016-03-25"))]=2015
data$Year[which(data$DATE>=as.Date("2016-10-22")&data$DATE<=as.Date("2017-03-25"))]=2016
data$Year[which(data$DATE>=as.Date("2017-10-22")&data$DATE<=as.Date("2018-03-25"))]=2017
data$Year[which(data$DATE>=as.Date("2018-10-22")&data$DATE<=as.Date("2019-03-25"))]=2018
data$SO2_Rate_KG_Kwh=data$SO2_MASS..tons./data$GLOAD..MWh.*1000
data$NOX_Rate_KG_Kwh=data$NOX_MASS..tons./data$GLOAD..MWh.*1000
data$CO2_Rate_KG_Kwh=data$CO2_MASS..tons./data$GLOAD..MWh.*1000

########################
# Generate EPA regions
Region1=c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont" )
Region2=c("New Jersey","New York")
Region3=c("Delaware", "District of Columbia", "Maryland", "Pennsylvania", "Virginia", "West Virginia")
Region4=c("Alabama", "Florida", "Georgia", "Kentucky", "Mississippi", "North Carolina", "South Carolina", "Tennessee")
Region5=c("Illinois","Indiana","Michigan","Minnesota","Ohio","Wisconsin")
Region6=c("Arkansas", "Louisiana", "New Mexico", "Oklahoma", "Texas")
Region7=c("Iowa", "Kansas", "Missouri", "Nebraska")
Region8=c("Colorado", "Montana", "North Dakota", "South Dakota", "Utah", "Wyominga")
Region9=c("Arizona", "California", "Hawaii", "Nevada")
Region10=c("Alaska", "Idaho", "Oregon", "Washingtona")
data$Region=0
data$Region[which(data$StateName%in%Region1)]=1
data$Region[which(data$StateName%in%Region2)]=2
data$Region[which(data$StateName%in%Region3)]=3
data$Region[which(data$StateName%in%Region4)]=4
data$Region[which(data$StateName%in%Region5)]=5
data$Region[which(data$StateName%in%Region6)]=6
data$Region[which(data$StateName%in%Region7)]=7
data$Region[which(data$StateName%in%Region8)]=8
data$Region[which(data$StateName%in%Region9)]=9
data$Region[which(data$StateName%in%Region10)]=10


########################
# Define time dimension
data$T[which(data$DATE=="2016-02-29")]=131
# Fix the Leap year
data$T[which(data$DATE>"2016-02-29"&data$DATE<="2016-03-25")]=data$T[which(data$DATE>"2016-02-29"&data$DATE<="2016-03-25")]+1


########################
# Merge inspection data by inspection penalty source
Inspect_Source=read.csv("Data/raw data/Inspection_Penalty_Source.csv")
# If there are some non-NA, the plant is matched by RegistryID. NA means zero inspection record.
Inspect_Source=subset(Inspect_Source, !(is.na(State_Formal))|!(is.na(State_Informal)))
Inspect_Source$EPA_Formal[which(is.na(Inspect_Source$EPA_Formal))]=0
Inspect_Source$State_Formal[which(is.na(Inspect_Source$State_Formal))]=0
Inspect_Source$LCR_Formal[which(is.na(Inspect_Source$LCR_Formal))]=0
Inspect_Source$EPA_Informal[which(is.na(Inspect_Source$EPA_Informal))]=0
Inspect_Source$State_Informal[which(is.na(Inspect_Source$State_Informal))]=0
Inspect_Source$LCR_Informal[which(is.na(Inspect_Source$LCR_Informal))]=0
data=left_join(data, Inspect_Source, by=c("Plant_Code"))


########################
# Generate post-shutdown dummy
data$post=0
data$post[which(data$DATE>"2019-01-27")]=1




########################
# Filter the data to power plants using coal as the primary source
list=c("coal")
list="coal"
data1=subset(data, PrimSource==list)

########################
# Generate shutdown and post-shutdown dummy
data1$shutdown=0
data1$shutdown[which( data1$DATE>="2018-12-29" & data1$DATE<="2019-01-27")]=1
data1$post=0
data1$post[which(data1$DATE>"2019-01-27")]=1

########################
# Generate group dummy based on whehter in shutdown year
data1$Group="Control"
data1$Group[which(data1$Year==2018)]="Treated"

########################
# Modify the units
data1$k_MWH=data1$GLOAD..MWh./1000
data1$k_mmBtu=data1$HEAT_INPUT..mmBtu./1000
data1$log_MWH=log(data1$GLOAD..MWh.+1)
data1$log_mmBtu=log(data1$k_mmBtu+1)

########################
# Modify the unit
data1$heat=data1$HEAT_INPUT..mmBtu./1000

########################
# Delete hybrid and inactive power plants 
data2=subset(data1, Total_MW==Coal_MW)
data3=subset(data2, HEAT_INPUT..mmBtu.>0)

########################
# Define weeks
data3$weeks=0
tt=8
for (t in 1:21) {
  tt=tt+7
  data3$weeks[which(data3$T>=(tt-7)&data3$T<tt)]=t+1
}
# Fill in the first week
data3$weeks[which(data3$T<8)]=1
# The last week has 8 days
data3$weeks[which(data3$T>154)]=22


########################
# Save the final dataset
save(data3, file = "data_final.RData")
